# -*- coding: utf-8 -*-
import copy

import scrapy

from meizitu.items import MeizituItem


class MztSpider(scrapy.Spider):
    """Spider for meizitu.com.

    Crawl order: front page tags -> paginated tag listings -> albums
    -> individual images.  A single ``MeizituItem`` is progressively
    filled along the chain and deep-copied into each request's ``meta``
    so concurrently scheduled requests never share mutable state.
    """

    name = 'mzt'
    allowed_domains = ['meizitu.com']
    start_urls = ['http://www.meizitu.com/']

    def parse(self, response):
        """Extract tag names and tag links from the front page.

        :param response: response for a start URL
        :return: yields one Request per tag, each carrying a partially
            filled item in ``meta``
        """
        tags = response.xpath(".//*[@class='tags']/span/a")
        for tag in tags:
            item = MeizituItem()
            item['tag_name'] = tag.xpath(".//@title").extract()[0]
            item['tag_href'] = tag.xpath(".//@href").extract()[0]
            # deepcopy: requests are processed asynchronously, so each
            # request must carry its own snapshot of the item rather than
            # a shared reference.
            yield scrapy.Request(url=item['tag_href'],
                                 meta={'item': copy.deepcopy(item)},
                                 callback=self.parse_page)

    def parse_page(self, response):
        """Extract the per-page listing URLs for a tag.

        Reads the pagination bar at the bottom of the tag page to work
        out how many listing pages the tag has, then reconstructs each
        page URL from the current URL.

        :param response: tag landing-page response
        :return: yields one Request per listing page of the tag
        """
        # Pagination buttons at the bottom of the tag page.
        page_lists = response.xpath(".//*[@id='wp_page_numbers']/ul/li")
        # Button captions; used to determine the number of listing pages.
        page_list = page_lists.xpath('.//text()')
        # Some tags render a leading "home" button ('首页'), others start
        # directly at "1", so the count of non-numeric buttons to subtract
        # differs between the two layouts.
        if len(page_lists) > 0:
            if page_list[0].extract() == '首页':
                page_num = len(page_lists) - 3
            else:
                page_num = len(page_lists) - 2
        else:
            page_num = 1

        # Build the URL prefix that page numbers are appended to.
        if '_' in response.url:
            # Keep everything up to and including the last underscore.
            index = response.url[::-1].index('_')
            href_pre = response.url[:-index]
        else:
            if page_num == 1:
                href_pre = response.url.split('.html')[0]
            else:
                href_pre = response.url.split('.html')[0] + '_'
        for page in range(1, page_num + 1):
            if page_num == 1:
                href = href_pre + '.html'
            else:
                href = href_pre + str(page) + '.html'
            item = response.meta['item']
            item['page_list'] = href
            # deepcopy: without it, every queued request would share one
            # item object and parse_album would only ever observe the last
            # URL assigned in this loop.
            yield scrapy.Request(url=item['page_list'],
                                 meta={'item': copy.deepcopy(item)},
                                 callback=self.parse_album)

    def parse_album(self, response):
        """Extract album names and album links from a listing page.

        :param response: listing-page response
        :return: yields one Request per album on the page
        """
        albums = response.xpath(".//*[@class='pic']")
        for album in albums:
            item = response.meta['item']
            item['album_name'] = album.xpath(".//a/img/@alt").extract()[0]
            item['album_href'] = album.xpath(".//a/@href").extract()[0]
            # deepcopy for the same shared-meta reason as in parse_page.
            yield scrapy.Request(url=item['album_href'],
                                 meta={'item': copy.deepcopy(item)},
                                 callback=self.parse_img)

    def parse_img(self, response):
        """Extract image titles and source URLs from an album page.

        :param response: album-page response
        :return: yields one fully populated MeizituItem per image
        """
        img_list = response.xpath(".//p/img")
        for position, img in enumerate(img_list, start=1):
            item = response.meta['item']
            img_title = img.xpath(".//@alt").extract()[0]
            # Fall back to "<album>_<position>" when the image carries no
            # alt text.  (The previous code looped over the whole image
            # list here, which always stamped every untitled image with
            # the LAST index — all fallbacks collided on the same title.)
            if not img_title:
                img_title = item['album_name'] + '_' + str(position)
            item['img_title'] = img_title
            item['img_src'] = img.xpath(".//@src").extract()[0]
            # deepcopy so items already handed to the pipeline are not
            # mutated by later loop iterations sharing the same meta dict.
            yield copy.deepcopy(item)